In [1]:
%matplotlib inline
from preamble import *
plt.rcParams['savefig.dpi'] = 100 # DPI of rendered figures; this controls the size of your figures
# Display the value of every top-level expression in a cell instead of only the last one.
# Comment out and restart notebook if you only want the last output of each cell.
InteractiveShell.ast_node_interactivity = "all"
In [2]:
# OpenML API key. Supply your own key through the OPENML_APIKEY environment
# variable so that no personal credential has to be written into the notebook.
# The fallback is a temporary read-only OpenML key. Replace with your own key later.
import os

oml.config.apikey = os.environ.get('OPENML_APIKEY', '11e82c8d91c5abece86f424369c71590')
SVMs can be trained with different kernels. Generate a 2-dimensional dataset as shown below and study the effect of the choice of kernel by visualizing the results.
In [3]:
from sklearn.datasets import make_blobs
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
# Two blobs in two dimensions form the binary classification problem.
X, y = make_blobs(centers=2, n_samples=1000, random_state=0)

# One SVM per kernel under study, all other settings left untouched.
svm_lin, svm_pol, svm_rbf = (SVC(kernel=name) for name in ('linear', 'poly', 'rbf'))

# Every kernel is evaluated identically: 10-fold cross-validated ROC AUC.
cv_settings = dict(cv=10, scoring='roc_auc', n_jobs=-1)
lin_score = cross_val_score(svm_lin, X, y, **cv_settings)
pol_score = cross_val_score(svm_pol, X, y, **cv_settings)
rbf_score = cross_val_score(svm_rbf, X, y, **cv_settings)

# Report the mean score over the folds for each kernel.
for kernel_label, fold_scores in (
    ("linear", lin_score),
    ("polynomial", pol_score),
    ("radial basis function", rbf_score),
):
    print("Mean 10-CV score of " + kernel_label + " kernel: " + str(fold_scores.mean()))
# Using a slightly adapted version of the plot_svm_kernels function from mglearn.
def plot_svm_kernels(X, y, kernels=('linear', 'poly', 'rbf'), gamma=2):
    """Fit one SVC per kernel on (X, y) and plot each decision boundary.

    For every kernel a separate 4x3 inch figure is drawn that shows the data
    points coloured by label, the support vectors (larger white markers with a
    black edge) and the contour lines of the decision function at the levels
    -0.5, 0 and 0.5. All figures are shown at the end.

    Parameters
    ----------
    X : 2-D array with two feature columns
        Training points; column 0 is plotted on the x-axis, column 1 on the
        y-axis. The decision function is evaluated on a two-column grid, so
        X must have exactly two features.
    y : array of labels
        Class label per row of X, also used to colour the points.
    kernels : iterable of str, optional
        Kernel names passed to ``SVC(kernel=...)``; one figure per entry.
    gamma : optional
        Value passed to ``SVC(gamma=...)`` for every kernel.
    """
    # The plot window and evaluation grid depend only on X, so they are
    # computed once instead of once per kernel.
    x_min = np.min(X, axis=0)[0] - 1
    x_max = np.max(X, axis=0)[0] + 1
    y_min = np.min(X, axis=0)[1] - 1
    y_max = np.max(X, axis=0)[1] + 1
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    grid_points = np.c_[XX.ravel(), YY.ravel()]

    # Figures are numbered 1, 2, ... in the order of the kernels.
    for fignum, kernel in enumerate(kernels, start=1):
        # fit the model
        clf = SVC(kernel=kernel, gamma=gamma)
        clf.fit(X, y)

        # plot the line, the points, and the nearest vectors to the plane
        plt.figure(fignum, figsize=(4, 3))
        plt.suptitle('kernel = %s' % kernel)
        plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                    s=85, edgecolors='k', c='w', zorder=10)
        plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.bwr)
        plt.axis('tight')

        # Evaluate the decision function on the grid and draw its contours:
        # solid line at 0, dashed lines at -0.5 and 0.5.
        Z = clf.decision_function(grid_points).reshape(XX.shape)
        plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
                    levels=[-.5, 0, .5])

        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.xticks(())
        plt.yticks(())
    plt.show()
# Visualise the decision boundary of each kernel on the blob data.
plot_svm_kernels(X, y)
As we can see, the linear kernel and the polynomial kernel fit the data very well, whereas the RBF kernel performs slightly worse. The figure shows that the ideal classifier here would be a linear classifier, as the data is roughly linearly separable. A polynomial kernel, even with higher-degree polynomials, can still act like a linear kernel, which is why it performs similarly to the linear kernel.
The RBF kernel, however, bases its decision on the distance to other points, which does not work as well in this case. It cannot find the optimal linear classifier; instead, it tries to group points that lie near other points.
In [4]:
# First performing a 10CV grid search to obtain the ROC_AUC values.
from sklearn.model_selection import GridSearchCV

# Three values per hyperparameter: very small, moderate and very large.
param_grid = {
    "C": [2e-15, 2, 2e15],
    "gamma": [2e-15, 2, 2e15],
}
svm_clf = SVC(kernel="rbf")
# cv=10 so that the search really is the 10-fold cross-validation announced
# above and named in the label printed with these scores (it used cv=3 before).
grid_search = GridSearchCV(svm_clf, param_grid, n_jobs=-1, cv=10, scoring="roc_auc")
# Assigning the return value keeps the fitted estimator from being echoed.
_ = grid_search.fit(X, y)
# One row per (C, gamma) combination; mean_test_score holds the mean AUC over the folds.
results = pd.DataFrame(grid_search.cv_results_)
scores = np.array(results.mean_test_score)
In [ ]:
# Using a slightly adapted version of the plot_svm_kernels function from mglearn.
def plot_svm_rbf_kernel(X, y, clf, C, gamma):
    """Plot the decision boundary of an already fitted classifier on 2-D data.

    Draws, in figure 1, the data points coloured by label, the support vectors
    of ``clf`` (larger white markers with a black edge) and the contour lines
    of its decision function at -0.5, 0 and 0.5, then shows the figure.

    Parameters
    ----------
    X : 2-D array with two feature columns
        Points to plot; column 0 on the x-axis, column 1 on the y-axis.
    y : array of labels
        Class label per row of X, used to colour the points.
    clf : fitted classifier
        Must expose ``support_vectors_`` and ``decision_function``.
    C, gamma :
        Only used to build the figure title.
    """
    figure_id = 1

    # Title, support vectors and data points.
    plt.figure(figure_id, figsize=(4, 3))
    plt.suptitle('C = ' + str(C) + ', gamma = ' + str(gamma))
    support = clf.support_vectors_
    plt.scatter(support[:, 0], support[:, 1],
                s=85, edgecolors='k', c='w', zorder=10)
    plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.bwr)
    plt.axis('tight')

    # Plot window: the data range widened by one unit on every side.
    lowest = np.min(X, axis=0)
    highest = np.max(X, axis=0)
    x_min, y_min = lowest[0] - 1, lowest[1] - 1
    x_max, y_max = highest[0] + 1, highest[1] + 1

    # Decision function evaluated on a 200 x 200 grid over the window.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    grid_points = np.c_[XX.ravel(), YY.ravel()]
    Z = clf.decision_function(grid_points).reshape(XX.shape)

    # Contours: solid line at 0, dashed lines at -0.5 and 0.5.
    plt.figure(figure_id, figsize=(4, 3))
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
                levels=[-.5, 0, .5])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
# Question: did you have to score manually on a 75-25 split, or are you supposed to use grid_search?
# If so, when using grid_search, how do you get all the estimators back for the support vectors?

# Walk over the grid-search results themselves so that every score is paired
# with the parameters it was computed for, instead of relying on a manually
# maintained index and a second copy of the value lists.
for params, mean_auc in zip(results["params"], results["mean_test_score"]):
    C, gamma = params["C"], params["gamma"]
    # Refit on all data to obtain the support vectors for this combination.
    svm_clf = SVC(kernel='rbf', C=C, gamma=gamma)
    svm_clf.fit(X, y)
    print("C value of " + str(C) + ", gamma value of " + str(gamma))
    # The fold count in the label is read from the search object so that it
    # always matches the cross-validation that produced the score.
    print("Mean test score (" + str(grid_search.cv) + "-CV AUC): " + str(mean_auc))
    print("Number of support vectors: " + str(svm_clf.support_vectors_.shape[0]))
    plot_svm_rbf_kernel(X, y, svm_clf, C, gamma)
Out[ ]:
Out[ ]:
Out[ ]:
Out[ ]:
Out[ ]:
Out[ ]:
Out[ ]:
In [ ]:
from sklearn.model_selection import GridSearchCV
# Candidate values 2*10^-12, 2*10^-11, ..., 2*10^12 for both hyperparameters.
exponents = range(-12, 13, 1)
param_grid = {
    name: [2*10**(i) for i in exponents]
    for name in ("C", "gamma")
}

# Exhaustive 10-fold cross-validated search over all (C, gamma) pairs, scored by ROC AUC.
svm_clf = SVC(kernel="rbf")
grid_search = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grid,
    n_jobs=3,
    cv=10,
    scoring="roc_auc",
)
# Assigning the return value keeps the fitted estimator from being echoed.
_ = grid_search.fit(X, y)
In [ ]:
# Collect the cross-validation results of the grid search.
results = pd.DataFrame(grid_search.cv_results_)

# Arrange the mean scores as a (number of C values) x (number of gamma values)
# grid. The shape is taken from param_grid itself: the previously hard-coded
# 24 x 24 did not match the number of candidate values and made reshape fail.
# NOTE(review): assumes the results are ordered with gamma varying fastest
# within each C, as the axis labels below imply; confirm against GridSearchCV.
n_C, n_gamma = len(param_grid["C"]), len(param_grid["gamma"])
scores = np.array(results.mean_test_score).reshape(n_C, n_gamma)

plt.figure(figsize=[8, 8])
# Plots the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='Gamma', xticklabels=param_grid["gamma"],
                      ylabel='C', yticklabels=param_grid["C"], cmap="viridis");
The Wall Robot Navigation dataset contains about 5500 readings of an ultrasound sensor array mounted on a robot, and your task is to finetune and train an SVM classifier to predict how the robot should move next.
In [3]:
# Download Robot data (OpenML dataset 1497)
robot_data = oml.datasets.get_dataset(1497)
# Split it into the predictors X and the labels y, using the dataset's default target.
target_name = robot_data.default_target_attribute
X, y = robot_data.get_data(target=target_name);
In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
# Hold out 20% of the data as a test set. stratify=y makes train_test_split
# stratify the split on the class labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Candidate values 2*10^-12, ..., 2*10^12 shared by C and gamma.
log_scale = [2*10**(i) for i in range(-12, 13, 1)]

# One sub-space per kernel family so that only the hyperparameters listed for
# that kernel are sampled.
param_grid = [
    {
        'kernel': ['linear'],
        'C': log_scale,
    },
    {
        'kernel': ['poly'],
        'degree': list(range(2, 11)),
        'gamma': log_scale,
        'C': log_scale,
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': log_scale,
        'gamma': log_scale,
    },
]

# Sample 30 configurations and evaluate each with 3-fold cross-validation.
# random_state makes the sampled configurations reproducible across runs.
random_search = RandomizedSearchCV(SVC(), param_distributions=param_grid, n_iter=30,
                                   n_jobs=-1, cv=3, random_state=42)
random_search.fit(X_train, y_train)

# best_score_ and best_params_ are attributes of the fitted search object;
# referring to them as bare names raised a NameError.
print("Best Score (3CV accuracy): " + str(random_search.best_score_))
print("Best parameters: ")
print(random_search.best_params_)
In [10]:
# Display the current hyperparameter search space for inspection.
param_grid
Out[10]:
A benchmark study is an experiment in which multiple algorithms are evaluated on multiple datasets. The end goal is to study whether one algorithm is generally better than the others. Meaningful benchmark studies can grow quite complex; here we do a simplified variant.
Consider the RAM prices dataset (included in the data folder). Separate the data into a training set of all data points up until the year 2000, and a test set with all points after that.
In [3]:
# Load the RAM price data and plot price against date on a logarithmic y-axis.
ram_prices = pd.read_csv('data/ram_price.csv')
plt.semilogy(ram_prices["date"], ram_prices["price"])
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte");
The goal here is to use everything you have learned to build the best model for a given classification task. The task is hosted on OpenML, so you will receive the train-test splits, and your model will be evaluated on the server. The goal is to reasonably select algorithms and hyperparameter settings to obtain the best model. You can also do model selection and parameter optimization as you have done before. Skeleton code is provided in the OpenML tutorial.
Note: Report AUC scores in your report as well. In case of issues with OpenML, we will use the experiments and scores mentioned in your report.
In [ ]: